Import Statements¶
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import country_converter as coco
import chardet
cc = coco.CountryConverter()
Notebook Presentation¶
pd.options.display.float_format = '{:,.3f}'.format
pd.options.mode.chained_assignment = None
Load the Data¶
The first column in the .csv file just has the row numbers, so it will be used as the index.
# Data source - https://www.kaggle.com/datasets/mayankanand2701/global-covid-19-dataset/data
with open('Global COVID-19 Dataset.csv', 'rb') as f:
result = chardet.detect(f.read())
df = pd.read_csv('Global COVID-19 Dataset.csv', encoding=result['encoding'], index_col=0)
result
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Preliminary Data Exploration¶
df.sample(4)
| Country Name | Cases | Deaths | Recovered | |
|---|---|---|---|---|
| S. No. | ||||
| 163.000 | Qatar | 514,524.000 | 690.000 | 513,834.000 |
| 32.000 | Russia | 24,158,502.000 | 402,821.000 | 23,755,681.000 |
| 2.000 | Bulgaria | 1,329,266.000 | 38,700.000 | 1,290,566.000 |
| 45.000 | Guam | 52,287.000 | 419.000 | 51,868.000 |
df.shape
(238, 4)
df.columns
Index(['Country Name', 'Cases', 'Deaths', 'Recovered'], dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 238 entries, 1.0 to nan Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country Name 230 non-null object 1 Cases 230 non-null float64 2 Deaths 230 non-null float64 3 Recovered 230 non-null float64 dtypes: float64(3), object(1) memory usage: 9.3+ KB
df.tail(10)
| Country Name | Cases | Deaths | Recovered | |
|---|---|---|---|---|
| S. No. | ||||
| 229.000 | Chad | 7,702.000 | 194.000 | 7,508.000 |
| 230.000 | Burundi | 54,569.000 | 15.000 | 54,554.000 |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
Data Cleaning¶
df.isna().values.any()
True
df.isna().values.sum()
32
df.isna().values
col_subset = ['Country Name', 'Cases', 'Deaths', 'Recovered']
df.loc[df.Cases.isna()][col_subset]
| Country Name | Cases | Deaths | Recovered | |
|---|---|---|---|---|
| S. No. | ||||
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
| NaN | NaN | NaN | NaN | NaN |
clean_df = df.dropna()
clean_df.duplicated(keep=False).values.any()
False
Characters cleaning: convert the data in the Cases, Deaths, Recovered columns and remove all the non-numeric characters¶
columns_to_clean = ['Cases', 'Deaths', 'Recovered']
for col in columns_to_clean:
clean_df[col] = clean_df[col].astype(str).str.replace(',', "")
# Convert column to a numeric data type
clean_df[col] = pd.to_numeric(clean_df[col]).astype(int)
clean_df
| Country Name | Cases | Deaths | Recovered | |
|---|---|---|---|---|
| S. No. | ||||
| 1.000 | Peru | 4524748 | 220831 | 4303917 |
| 2.000 | Bulgaria | 1329266 | 38700 | 1290566 |
| 3.000 | Bosnia and Herzegovina | 403638 | 16388 | 387250 |
| 4.000 | Hungary | 2230381 | 49051 | 2181330 |
| 5.000 | North Macedonia | 350589 | 9977 | 340612 |
| ... | ... | ... | ... | ... |
| 226.000 | Tajikistan | 17786 | 125 | 17661 |
| 227.000 | Benin | 28036 | 163 | 27873 |
| 228.000 | Niger | 9515 | 315 | 9200 |
| 229.000 | Chad | 7702 | 194 | 7508 |
| 230.000 | Burundi | 54569 | 15 | 54554 |
230 rows × 4 columns
Descriptive Statistics and Additional Calculations¶
clean_df["Death_Rate"] = (clean_df["Deaths"] / clean_df["Cases"]).round(3)
newdf = clean_df
newdf.describe()
| Cases | Deaths | Recovered | Death_Rate | |
|---|---|---|---|---|
| count | 230.000 | 230.000 | 230.000 | 230.000 |
| mean | 4,234,399.339 | 36,262.422 | 4,198,136.917 | 0.013 |
| std | 16,639,054.089 | 134,860.421 | 16,523,705.023 | 0.016 |
| min | 1,403.000 | 1.000 | 1,395.000 | 0.000 |
| 25% | 30,557.000 | 190.250 | 30,240.250 | 0.004 |
| 50% | 224,696.000 | 2,147.500 | 222,791.500 | 0.009 |
| 75% | 1,363,680.250 | 16,367.250 | 1,353,812.500 | 0.017 |
| max | 185,636,059.000 | 1,261,370.000 | 184,374,689.000 | 0.181 |
- Std is greater than the mean df, which may indicate large differences between values and non-normal distribution of the data.
- This distribution is generally expected for a given df.
print(f'Country with the maximum number of cases: {newdf.sort_values(by=["Cases"], ascending=False, ignore_index=True).loc[0, "Country Name"]}')
print(f'Country with the maximum number of deaths: {newdf.sort_values(by=["Deaths"], ascending=False, ignore_index=True).loc[0, "Country Name"]}')
print(f'Country with the maximum number of recovered people: {newdf.sort_values(by=["Recovered"], ascending=False, ignore_index=True).loc[0, "Country Name"]}')
Country with the maximum number of cases: European Union Country with the maximum number of deaths: European Union Country with the maximum number of recovered people: European Union
newdf[newdf["Country Name"] == "European Union"]
| Country Name | Cases | Deaths | Recovered | Death_Rate | |
|---|---|---|---|---|---|
| S. No. | |||||
| 31.000 | European Union | 185636059 | 1261370 | 184374689 | 0.007 |
# We interested in data about individual countries, so let's drop data about EU
newdf.drop(index=31.000, axis=0, inplace=True)
newdf.sort_values(by=["Cases"], ascending=False)
| Country Name | Cases | Deaths | Recovered | Death_Rate | |
|---|---|---|---|---|---|
| S. No. | |||||
| 17.000 | United States | 103436829 | 1186984 | 102249845 | 0.011 |
| 194.000 | China | 99354727 | 122196 | 99232531 | 0.001 |
| 153.000 | India | 45037776 | 533589 | 44504187 | 0.012 |
| 39.000 | France | 38997490 | 168091 | 38829399 | 0.004 |
| 54.000 | Germany | 38437756 | 174979 | 38262777 | 0.005 |
| ... | ... | ... | ... | ... | ... |
| 121.000 | Wallis and Futuna | 3760 | 9 | 3751 | 0.002 |
| 173.000 | Macau | 3514 | 121 | 3393 | 0.034 |
| 159.000 | Saint Pierre and Miquelon | 3426 | 2 | 3424 | 0.001 |
| 193.000 | Tuvalu | 2943 | 1 | 2942 | 0.000 |
| 68.000 | Montserrat | 1403 | 8 | 1395 | 0.006 |
229 rows × 5 columns
# add ISO-3 country codes
newdf["ISO"] = newdf["Country Name"].apply(lambda x: coco.convert(names=x, to='ISO3', not_found=None))
df = newdf
df
| Country Name | Cases | Deaths | Recovered | Death_Rate | ISO | |
|---|---|---|---|---|---|---|
| S. No. | ||||||
| 1.000 | Peru | 4524748 | 220831 | 4303917 | 0.049 | PER |
| 2.000 | Bulgaria | 1329266 | 38700 | 1290566 | 0.029 | BGR |
| 3.000 | Bosnia and Herzegovina | 403638 | 16388 | 387250 | 0.041 | BIH |
| 4.000 | Hungary | 2230381 | 49051 | 2181330 | 0.022 | HUN |
| 5.000 | North Macedonia | 350589 | 9977 | 340612 | 0.028 | MKD |
| ... | ... | ... | ... | ... | ... | ... |
| 226.000 | Tajikistan | 17786 | 125 | 17661 | 0.007 | TJK |
| 227.000 | Benin | 28036 | 163 | 27873 | 0.006 | BEN |
| 228.000 | Niger | 9515 | 315 | 9200 | 0.033 | NER |
| 229.000 | Chad | 7702 | 194 | 7508 | 0.025 | TCD |
| 230.000 | Burundi | 54569 | 15 | 54554 | 0.000 | BDI |
229 rows × 6 columns
Visualise the Features¶
Number of Cases vs Deaths¶
fig = px.scatter(df, x='Cases', y='Deaths', color='Deaths', size='Deaths', hover_name='Country Name',
color_continuous_scale=px.colors.sequential.Bluered)
fig.update_layout(xaxis_title="Number of Cases", yaxis_title="Number of Deaths", height=500,
xaxis=dict(range=[0, 120000000]), yaxis=dict(range=[0, 1300000]))
fig.show()
According to the data set, the maximum number of Covid-19 cases (about 100 million) was observed in 2 countries - the USA and China. Despite the large number of infected people in these countries, the number of deaths varies sharply. Unlike the United States with more than 1 million deaths, in China about 120 thousand deaths have been registered, which is 9,7 times less than in the United States.
India is in 3rd place with 45 million Covid-19 cases, followed in descending order by France, Germany, Brazil, South Korea and Japan.
Case-fatality ratio in the top 10 countries (by Number of Cases) 🔺¶
top10 = df.sort_values(by=["Cases"], ascending=False).iloc[:10]
top10['Death_Rate_Percent'] = top10['Death_Rate'] * 100
top10.head(10)
| Country Name | Cases | Deaths | Recovered | Death_Rate | ISO | Death_Rate_Percent | |
|---|---|---|---|---|---|---|---|
| S. No. | |||||||
| 17.000 | United States | 103436829 | 1186984 | 102249845 | 0.011 | USA | 1.100 |
| 194.000 | China | 99354727 | 122196 | 99232531 | 0.001 | CHN | 0.100 |
| 153.000 | India | 45037776 | 533589 | 44504187 | 0.012 | IND | 1.200 |
| 39.000 | France | 38997490 | 168091 | 38829399 | 0.004 | FRA | 0.400 |
| 54.000 | Germany | 38437756 | 174979 | 38262777 | 0.005 | DEU | 0.500 |
| 22.000 | Brazil | 37519960 | 702116 | 36817844 | 0.019 | BRA | 1.900 |
| 130.000 | South Korea | 34571873 | 35934 | 34535939 | 0.001 | KOR | 0.100 |
| 134.000 | Japan | 33803572 | 74694 | 33728878 | 0.002 | JPN | 0.200 |
| 21.000 | Italy | 26721656 | 197001 | 26524655 | 0.007 | ITA | 0.700 |
| 20.000 | United Kingdom | 24924986 | 232112 | 24692874 | 0.009 | GBR | 0.900 |
plt.figure(figsize=(9,4))
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
ax1 = plt.gca() # get current axes
ax2 = ax1.twinx()
ax1.bar(top10['Country Name'], top10['Cases'], color='grey')
ax2.plot(top10['Country Name'], top10['Death_Rate_Percent'], color='red', linewidth=2, marker='o')
ax1.set_ylabel('Nr of Covid-19 Cases', fontsize=12, color='Grey')
ax2.set_ylabel('Case-fatality ratio, %', fontsize=12, color='Red')
Text(0, 0.5, 'Case-fatality ratio, %')
The case fatality rate in the top 10 countries by number of Covid-19 cases does not exceed 2%, with the highest values in Brazil and India (1.9% and 1.2%, respectively).
Next question is - Which countries had the highest case-fatality ratio?🔻
Top 20 countries with the most observed case-fatality ratio¶
top20 = df.sort_values(by=["Death_Rate"], ascending=False).iloc[:20]
top20['Death_Rate_Percent'] = top20['Death_Rate'] * 100
# bar chart
fig_bar = px.bar(top20, x='Country Name', y='Death_Rate_Percent', color="Death_Rate_Percent",
text=top20['Death_Rate_Percent'].round(1),
labels={'Country Name':'', "Death_Rate_Percent":'Case-Fatality Ratio, %'},
color_continuous_scale=px.colors.diverging.Portland)
fig_bar.update_layout(height=400)
# choropleth map
fig_map = px.choropleth(top20, locations="ISO", color="Death_Rate_Percent",
hover_name='Country Name', hover_data="Death_Rate_Percent",
projection='natural earth',
color_continuous_scale=px.colors.diverging.Portland)
# Show the figures side by side using HTML divs
from IPython.display import display, HTML
display(HTML('<div>' + fig_bar.to_html(full_html=False) + '<div>' + fig_map.to_html(full_html=False) +'</div>' + '</div>'))
The highest case-fatality ratio reaches 18% in Yemen and exceeds the 2% threshold for most other countries in the Global South.
Number of Covid-19 cases on the World Map¶
fig = px.choropleth(df, locations="ISO",
color="Cases",
hover_name = 'Country Name',
hover_data = "Cases",
projection = 'natural earth',
color_continuous_scale=px.colors.sequential.Sunsetdark,
title = 'Number of Covid-19 cases',
width = 800,
height = 550,
)
fig.show()
Covid-19 case-fatality ratio on the World Map¶
fig = px.choropleth(df, locations="ISO",
color="Death_Rate",
hover_name = 'Country Name',
hover_data = "Death_Rate",
projection = 'natural earth',
color_continuous_scale=px.colors.sequential.Redor,
title = 'Covid-19 Death Rate',
width = 800,
height = 550,
)
fig.show()